Processing methylation data

Here we will use an existing methylation dataset - 450k in cord blood (15 samples)
To read in your own dataset (usually “idat files” ending in .idat)
See the help for ?read.metharray.exp

Load packages that we will use focusing on minfi:
see Aryee et al. Bioinformatics 2014. http://doi.org/10.1093/bioinformatics/btu049
Other popular options for processing & analyzing methylation data include RnBeads and methylumi

suppressMessages(library(minfi)) # popular package for methylation data
library(FlowSorted.CordBlood.450k) # example dataset
library(pryr) # for monitoring memory use
library(ENmix) # probe type adjustment "rcp"
## Loading required package: doParallel
suppressMessages(require(sva)) # for addressing batch effects

bring a 450k dataset in to memory (from eponymous BioC package above)
see Bakulski et al. Epigenetics 2016.

data(FlowSorted.CordBlood.450k)

because it is flow sorted - the authors give us the cell types
here we show the frequencies:

table(pData(FlowSorted.CordBlood.450k)$CellType)
## 
##      Bcell       CD4T       CD8T       Gran       Mono         NK 
##         15         15         14         12         15         14 
##       nRBC WholeBlood 
##          4         15
# subset to just the Whole Blood samples since this is the most common for epi studies
WB <- FlowSorted.CordBlood.450k[, pData(FlowSorted.CordBlood.450k)$CellType == "WholeBlood"]
ncol(WB)
## Samples 
##      15

we need to change the sampleName attribute here just because we are using a reference and these samples are otherwise in the reference dataset when we want to estimate their composition.

# your personal samples won't need to be renamed
sampleNames(WB) <- 1:15

Look at the attributes of this dataset
It is stored as a RGChannelSet which means it is not yet processed (red and green signals stored separately)

WB
## RGChannelSet (storageMode: lockedEnvironment)
## assayData: 622399 features, 15 samples 
##   element names: Green, Red 
## An object of class 'AnnotatedDataFrame'
##   sampleNames: 1 2 ... 15 (15 total)
##   varLabels: X Plate_ID ... CellType (13 total)
##   varMetadata: labelDescription
## Annotation
##   array: IlluminaHumanMethylation450k
##   annotation: ilmn12.hg19

Estimate cell proportions

Estimating proportions of 7 cell types found in cord blood (note the nucleated Red Blood Cells) This next command is commented out because it requires >4GB of RAM if you don’t have that - you can load the presaved output below

# cellprop <- estimateCellCounts(WB, compositeCellType = "CordBlood",
#   cellTypes = c("CD8T","CD4T", "NK","Bcell","Mono","Gran", "nRBC"))
# write.csv(cellprop, file = "data/cellprop_WB_15samps_bakulski2016.csv", row.names = F)

read in the estimated cell proportions:

cellprop <- read.csv("data/cellprop_WB_15samps_bakulski2016.csv")

drop the reference dataset from memory

rm(FlowSorted.CordBlood.450k)

Here are the estimates

knitr::kable(cellprop, digits = 2)
CD8T CD4T NK Bcell Mono Gran nRBC
0.13 0.22 0.00 0.07 0.08 0.44 0.09
0.11 0.08 0.00 0.08 0.10 0.55 0.11
0.15 0.10 0.05 0.05 0.08 0.53 0.07
0.25 0.14 0.02 0.12 0.07 0.34 0.08
0.18 0.20 0.00 0.06 0.06 0.44 0.09
0.16 0.11 0.00 0.08 0.11 0.54 0.06
0.18 0.19 0.00 0.08 0.04 0.48 0.07
0.14 0.07 0.00 0.07 0.11 0.58 0.05
0.11 0.10 0.00 0.06 0.09 0.61 0.04
0.16 0.13 0.08 0.08 0.04 0.22 0.32
0.18 0.15 0.00 0.13 0.06 0.38 0.19
0.09 0.10 0.00 0.04 0.06 0.42 0.29
0.12 0.04 0.00 0.09 0.10 0.60 0.05
0.20 0.17 0.00 0.05 0.06 0.47 0.06
0.13 0.15 0.00 0.14 0.06 0.38 0.17

note that they are close to summing to 1

summary(rowSums(cellprop))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.002   1.013   1.021   1.026   1.031   1.099

Distribution of estimated cell types

boxplot(cellprop*100, col=1:ncol(cellprop),xlab="Cell type",ylab="Estimated %")

Preprocessing the data

Preprocess the data - this removes technical variation
There are several popular methods including intra- and inter-sample normalizations
Here we demonstrate an effective and lightweight approach:
“Normal out of band background” (Noob) within-sample correction - see Triche et al 2013

system.time(WB.noob <- preprocessNoob(WB))
## Loading required package: IlluminaHumanMethylation450kmanifest
## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19
## [preprocessNoob] Using sample number 6 as reference level...
##    user  system elapsed 
##  19.957   3.453  24.550

We see the resulting object is now a MethylSet (because the RGset has been preprocessed) Minfi is incorrectly saying the data are still raw - we verify this is not true below

WB.noob
## MethylSet (storageMode: lockedEnvironment)
## assayData: 485512 features, 15 samples 
##   element names: Meth, Unmeth 
## An object of class 'AnnotatedDataFrame'
##   sampleNames: 1 2 ... 15 (15 total)
##   varLabels: X Plate_ID ... CellType (13 total)
##   varMetadata: labelDescription
## Annotation
##   array: IlluminaHumanMethylation450k
##   annotation: ilmn12.hg19
## Preprocessing
##   Method: Raw (no normalization or bg correction)
##   minfi version: 1.18.6
##   Manifest version: 0.4.0

we can look at a few methylation values on the fly and see that the preprocessing changed them

# first three CpGs on the first three samples
# raw RGset
print(getBeta(WB)[1:3,1:3], digits = 2)
##               1    2     3
## cg00050873 0.56 0.54 0.838
## cg00212031 0.61 0.80 0.017
## cg00213748 0.32 0.49 0.912
# after preprocessing
print(getBeta(WB.noob)[1:3,1:3], digits = 2)
##               1    2     3
## cg00050873 0.51 0.51 0.871
## cg00212031 0.52 0.54 0.023
## cg00213748 0.47 0.50 0.904

Distribution of beta-values: before and after normalization

densityPlot(WB, main = "density plots before and after preprocessing", pal="blue")
densityPlot(WB.noob, add = F, pal = "magenta")
# Add legend
legend("topright", c("Noob","Raw"), 
  lty=c(1,1), title="Normalization", 
  bty='n', cex=0.8, col=c("magenta","blue"))

notice the blue density traces (raw) are more spread out; background correction brings them together
## probe failures due to low intensities We want to drop probes with intensity that is not significantly above background signal (from negative control probes)

detect.p <- detectionP(WB, type = "m+u")

Count of failed probes per sample P>0.01 (i.e. not significant compared to background signal)

knitr::kable(t(as.matrix(colSums(detect.p > 0.01))))
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
452 369 108 134 409 226 167 341 372 313 237 396 420 95 119

This is less than 1% of probes per sample
Total number of failed measures (in all probes)

sum(detect.p > 0.01)
## [1] 4158

Restrict data to good probes only:

detect.p[detect.p > 0.01] <- NA
detect.p <- na.omit(detect.p)
intersect <- intersect(rownames(getAnnotation(WB)), rownames(detect.p))
length(intersect)
## [1] 483916

Filter bad probes from our methylset

nrow(WB.noob)
## Features 
##   485512
WB.noob <- WB.noob[rownames(getAnnotation(WB.noob)) %in% intersect,]
nrow(WB.noob)
## Features 
##   483916
# cleanup
rm(intersect, detect.p, WB)

Probe type adjustment

Need to adjust for probe-type bias Infinium I (type I) and Infinium II (type II) probes

## RCP with EnMix: Regression on Correlated Probes (Niu et al. Bioinformatics 2016)
betas.rcp <- rcp(WB.noob)
dim(betas.rcp)
## [1] 483916     15

note that this package takes beta values out of the minfi object - result is a matrix

class(betas.rcp) 
## [1] "matrix"
## Annotation of Infinium type for each probe (I vs II)
typeI <-   minfi::getProbeInfo(WB.noob,type="I")$Name
typeII <-  minfi::getProbeInfo(WB.noob,type="II")$Name
onetwo <- rep(1, nrow(betas.rcp))
onetwo[rownames(betas.rcp) %in% typeII] <- 2
# almost three quarter of our probes are type II
knitr::kable(t(table(onetwo)))
1 2
135078 348838

Density plots by Infinium type: before and after RCP calibration
Probe-type bias adjustment before and after RCP

par(mfrow=c(1,2)) # Side-by-side density distributions 
densityPlot(WB.noob[rownames(getAnnotation(WB.noob)) %in% typeI,],pal = "red",main='Beta density')
densityPlot(WB.noob[rownames(getAnnotation(WB.noob)) %in% typeII,],add = F, pal = "blue")
densityPlot(betas.rcp[rownames(getAnnotation(WB.noob)) %in% typeI,],pal = "red",main='Beta density probe-type adjusted')
densityPlot(betas.rcp[rownames(getAnnotation(WB.noob)) %in% typeII,],add = F, pal = "blue")
legend("topright", c("Infinium I","Infinium II"), 
       lty=c(1,1), title="Infinium type", 
       bty='n',col=c("red","blue"))

notice that the type I and II peaks are more closely aligned after rcp adjustment
(particularly in the higher peak)

rm(onetwo, typeI, typeII)

Batch effects

As an example of an observable batch effect, samples are processed in plates (e.g. bisulfite converting 96 at a time).
This can create batch effects (technical variation) with different intensities by plate.
Other commonly observed batch effects include the position on the chip (e.g. the row effect).
Let’s check if samples were on different plates in these data:

knitr::kable(t(as.matrix(table(pData(WB.noob)$Plate_ID))), col.names = c("Plate 1","Plate2"))
Plate 1 Plate2
6 9

Principal Component Analysis (PCA)

Calculate major sources of variability of DNA methylation using PCA

PCobject = prcomp(t(betas.rcp), retx = T, center = T, scale. = T)

Extract the Principal Components from SVD

PCs <- PCobject$x

Proportion of variance explained by each additional PC

cummvar <- summary(PCobject)$importance["Cumulative Proportion", 1:10]
knitr::kable(t(as.matrix(cummvar)),digits = 2)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
0.4 0.53 0.59 0.65 0.7 0.74 0.78 0.81 0.85 0.88

Is the major source of variability associated with sample plate?

boxplot(PCs[,1]~pData(WB.noob)$Plate_ID,
        xlab = "Sample Plate",ylab="PC1",
        col=c("red","blue"))

t.test(PCs[,1]~pData(WB.noob)$Plate_ID)
## 
##  Welch Two Sample t-test
## 
## data:  PCs[, 1] by pData(WB.noob)$Plate_ID
## t = -2.0899, df = 12.409, p-value = 0.05784
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -804.81998   15.29786
## sample estimates:
## mean in group 1 mean in group 2 
##       -236.8566        157.9044

Removing batch effects using ComBat from the sva package

# First we convert from beta-values to M-values
Mvals <- log2(betas.rcp)-log2(1-betas.rcp)

ComBat eBayes adjustment using a known variable of interest (here we use plate)

Mvals.ComBat <- ComBat(Mvals, batch = pData(WB.noob)$Plate_ID)
## Found 2 batches
## Adjusting for 0 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# Convert M-values back to beta-values
betas.rcp <- 2^Mvals.ComBat/(1+2^Mvals.ComBat)

PCA after removing batch effects

PCobject <- prcomp(t(betas.rcp), retx = T, center = T, scale. = T)
PCs <- PCobject$x
cummvar <- summary(PCobject)$importance["Cumulative Proportion", 1:10]

Proportion of variance explained by each additional PC

knitr::kable(t(as.matrix(cummvar)),digits = 2)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
0.37 0.5 0.57 0.64 0.69 0.74 0.78 0.82 0.86 0.89

The first PC is no longer associated with sample plate

boxplot(PCs[,1] ~ pData(WB.noob)$Plate_ID,
        xlab = "Sample Plate", ylab = "PC1",
        col = c("red","blue"))

t.test(PCs[,1] ~ pData(WB.noob)$Plate_ID)
## 
##  Welch Two Sample t-test
## 
## data:  PCs[, 1] by pData(WB.noob)$Plate_ID
## t = -0.91292, df = 12.904, p-value = 0.378
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -645.1705  262.0809
## sample estimates:
## mean in group 1 mean in group 2 
##      -114.92688        76.61792

ComBat removed the apparent batch effect

#cleanup
rm(PCs, Mvals, cummvar, PCobject)

memory usage

as a reminder - these are large datasets and we are working in RAM.
Check memory usage with:

pryr::mem_used()
## 1.13 GB
# this command will check the maximum memory usage on Windows
memory.size(max = T)
## Warning: 'memory.size()' is Windows-specific
## [1] Inf

End of script 1

Analyze methylation data

Using data preprocessed in our script:
meth01_process_data.R

we have a processed dataset with 15 samples (otherwise we run script 01)

if(!exists("WB.noob")){
  source("code/meth01_process_data.R")
}
dim(WB.noob)
## Features  Samples 
##   483916       15

load packages

suppressPackageStartupMessages({
  library(CpGassoc)
  library(data.table)
  library(qqman)
  library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
  library(DMRcate)
})

predict sex from methylation

Gbeta <- mapToGenome(WB.noob)  #map to the genome

getSex predicts sex based on X and Y chromosome methylation intensity

getSex(Gbeta) 
## DataFrame with 15 rows and 3 columns
##          xMed      yMed predictedSex
##     <numeric> <numeric>  <character>
## 1    13.81941  9.532875            F
## 2    13.85021  9.676420            F
## 3    13.23687 13.600281            M
## 4    13.29954 13.616983            M
## 5    13.69979  9.649878            F
## ...       ...       ...          ...
## 11   13.23359 13.552924            M
## 12   13.78387 10.039801            F
## 13   13.78433  9.997809            F
## 14   13.33468 13.610287            M
## 15   13.24118 13.561319            M

we see that our predictions match the phenodata

table(pData(WB.noob)$Sex,getSex(Gbeta)$predictedSex)
##    
##     F M
##   F 8 0
##   M 0 7
rm(Gbeta)

consolidate our phenodata

pheno <- as.data.frame(cbind(Sex=pData(WB.noob)$Sex, Plate_ID=pData(WB.noob)$Plate_ID, cellprop))

1 female, 2 male

# cleanup
rm(WB.noob)

quick check of the distribution of gender between plates

counts <- table(pheno[,"Sex"], pheno[,"Plate_ID"])
Percentage <- prop.table(counts, 2); 
par(mfrow = c(1, 1))
barplot(Percentage, main = "Distribution of sex within plates",
        xlab = "plate", col = c("grey","white"), 
        legend = c("F","M"), args.legend = list(x = "topleft"));

rm(counts, Percentage)
pheno[,"Sex"] <- ifelse(as.numeric(pheno[,"Sex"])==2,0,1)

1 female, 0 male ## Cleaning up the methylation data Filters a matrix of beta values by distance to SNP. Also removes cross-reactive probes and sex-chromosome probes.

dim(betas.rcp)
## [1] 483916     15
betas.clean <- rmSNPandCH(betas.rcp,  dist = 2, mafcut = 0.05, and = TRUE, rmcrosshyb = TRUE, rmXY= TRUE)
nCpG <- nrow(betas.clean)
nCpG
## [1] 427140

Running an Epigenome Wide Association

Here we run an EWAS on sex (as a predictor of methylation) first we can run a linear regression on a single cpg

j <- 133211

CpG.level <- betas.clean[j,]
CpG.name <- rownames(betas.clean)[j]
CpG.name
## [1] "cg12691488"

difference in methylation between males and females for this CpG

knitr::kable(cbind(Min=round(simplify2array(tapply(CpG.level, pheno[,"Sex"],min)),3),
                   Mean=round(simplify2array(tapply(CpG.level, pheno[,"Sex"],mean)),3), 
                   Median=round(simplify2array(tapply(CpG.level, pheno[,"Sex"],median)),3),
                   Max=round(simplify2array(tapply(CpG.level, pheno[,"Sex"],max)),3),
                   SD=round(simplify2array(tapply(CpG.level, pheno[,"Sex"],sd)),3),
                   N=table(pheno[,"Sex"])))
Min Mean Median Max SD N
0 0.265 0.302 0.301 0.357 0.031 7
1 0.034 0.049 0.051 0.065 0.010 8
boxplot(CpG.level ~ pheno[,"Sex"])

EWAS and results using CpGassoc

sex as predictor
note that CpGassoc is quite fast for running almost a half million regressions!

system.time(results1 <- cpg.assoc(betas.clean, pheno[,"Sex"]))
##    user  system elapsed 
##  28.161   0.888  29.433

there are several components of the results

class(results1)
## [1] "cpg"
names(results1)
## [1] "results"      "Holm.sig"     "FDR.sig"      "info"        
## [5] "indep"        "covariates"   "chip"         "coefficients"

look at a few results
here effect size is ~ mean difference in methylation proportion

head(cbind(results1$coefficients[,4:5], P.value=results1$results[,3]))
##             effect.size   std.error    P.value
## cg00000957 -0.017918138 0.010723928 0.11863535
## cg00001349 -0.035955083 0.021925575 0.12499334
## cg00001583  0.003733452 0.001324399 0.01449336
## cg00002028  0.004715775 0.002219063 0.05332134
## cg00002719  0.003519761 0.001548240 0.04061259
## cg00002837  0.020332819 0.022808189 0.38887927

and the top hits

head(cbind(results1$coefficients[,4:5], P.value=results1$results[,3])[order(results1$results[,3]),])
##            effect.size   std.error      P.value
## cg03691818  0.08103594 0.003434072 4.668336e-12
## cg12691488 -0.25262909 0.011492007 1.148608e-11
## cg11643285  0.15339625 0.008407993 1.206993e-10
## cg25304146 -0.11498560 0.009167943 1.227870e-08
## cg02989351  0.05057124 0.004635112 6.487831e-08
## cg25294185 -0.08147197 0.007731097 9.760913e-08

check with previous result on our selected CpG (running lm without CpGassoc)

cbind(results1$coefficients[j,4:5], results1$results[j,c(1,3)])
##            effect.size  std.error CPG.Labels      P.value
## cg12691488  -0.2526291 0.01149201 cg12691488 1.148608e-11
summary(lm(CpG.level~pheno[,"Sex"]))
## 
## Call:
## lm(formula = CpG.level ~ pheno[, "Sex"])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.036900 -0.013928 -0.000714  0.005826  0.055298 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.302080   0.008393   35.99 2.09e-14 ***
## pheno[, "Sex"] -0.252629   0.011492  -21.98 1.15e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0222 on 13 degrees of freedom
## Multiple R-squared:  0.9738, Adjusted R-squared:  0.9718 
## F-statistic: 483.3 on 1 and 13 DF,  p-value: 1.149e-11

Bonferroni significant hits

table(results1$results[,3] < 0.05/(nCpG))
## 
##  FALSE   TRUE 
## 427133      7

FDR significant hits

table(results1$results[,5] < 0.05)
## 
##  FALSE   TRUE 
## 427125     15

EWAS with adjustment for cell types

results2 <- cpg.assoc(betas.clean,pheno[,"Sex"], 
                      covariates=pheno[,"CD8T"]+ pheno[,"CD4T"] +  
                        pheno[,"NK"]  + pheno[,"Bcell"] + 
                        pheno[,"Mono"] + pheno[,"Gran"] + pheno[,"nRBC"])

look at the results

head(cbind(results2$coefficients[,4:5], P.value=results2$results[,3])[order(results2$results[,3]),])
##            effect.size   std.error      P.value
## cg03691818  0.07876081 0.003921956 1.329699e-10
## cg11643285  0.16343613 0.008317394 1.713980e-10
## cg12691488 -0.25965655 0.013232169 1.741475e-10
## cg25294185 -0.09216462 0.006991428 1.685787e-08
## cg25304146 -0.11846399 0.010854210 1.381006e-07
## cg23739457  0.08311714 0.008908057 7.531446e-07

Bonferroni significant hits

table(results2$results[,3] < 0.05/(nCpG))
## 
##  FALSE   TRUE 
## 427136      4

FDR significant hits

table(results2$results[,5] < 0.05)
## 
##  FALSE   TRUE 
## 427135      5

we can also see them with:

results2$FDR.sig
##        CPG.Labels T.statistic      P.value Holm.sig          FDR
## 70962  cg25294185   -13.18252 1.685787e-08     TRUE 1.800167e-03
## 72477  cg03691818    20.08202 1.329699e-10     TRUE 2.479512e-05
## 133211 cg12691488   -19.62313 1.741475e-10     TRUE 2.479512e-05
## 180139 cg11643285    19.64992 1.713980e-10     TRUE 2.479512e-05
## 397058 cg25304146   -10.91411 1.381006e-07    FALSE 1.179766e-02
##          gc.p.value
## 70962  7.752271e-08
## 72477  6.503115e-10
## 133211 8.497422e-10
## 180139 8.364421e-10
## 397058 6.064966e-07

qqplot and lambda interpretation

par(mfrow=c(1,1))
plot(results1, main="QQ plot for association between methylation and sex")

plot(results2, main="QQ plot for association between methylation and sex \n adjusted for cells proportion")

Lambda - this is a summary of genomic inflation
ratio of observed vs expected median p-value - is there early departure of the qqline?
estimated at -log10(0.5) ~ 0.3 on the x-axis of a qqplot

lambda <- function(p) median(qchisq(p, df=1, lower.tail=FALSE), na.rm=TRUE) / qchisq(0.5, df=1)

Lambda for the first EWAS

lambda(results1$results[,3])
## [1] 3.201929

Lambda after cell type adjustment

lambda(results2$results[,3])
## [1] 1.230098

Volcano Plot-results2 with Bonferroni threshold and current FDR

plot(results2$coefficients[,4],-log10(results2$results[,3]), 
     xlab="Estimate", ylab="-log10(Pvalue)", main="Volcano Plot- adj for CellProp")
abline(h = -log10(0.05/(nCpG)), lty=1, col="red", lwd=2)
abline(h = -log10(max(results2$results[results2$results[,5] < 0.05,3])), lty=1, col="blue", lwd=2)

# cleanup
rm(results1)

Manhattan plot for cell-type adjusted EWAS

the function manhattan needs data.frame including CpG, Chr, MapInfo and Pvalues

data(IlluminaHumanMethylation450kanno.ilmn12.hg19)
IlluminaAnnot <- data.frame(
  chr=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Locations$chr,
  pos=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Locations$pos,
  Relation_to_Island=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Islands.UCSC$Relation_to_Island,
  UCSC_RefGene_Name=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Other$UCSC_RefGene_Name)
# UCSC_RefGene_Group=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Other$UCSC_RefGene_Group,
# probeA=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Manifest$ProbeSeqA,
# probeB=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Manifest$ProbeSeqB,
# Forward_Sequence=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Other$Forward_Sequence,
# SourceSeq=IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Other$SourceSeq)

Create CpG name and annotate row names

rownames(IlluminaAnnot) <- rownames(IlluminaHumanMethylation450kanno.ilmn12.hg19@data$Manifest)
IlluminaAnnot$Name <- rownames(IlluminaAnnot)
dim(IlluminaAnnot)
## [1] 485512      5
IlluminaAnnot <- IlluminaAnnot [intersect(rownames(IlluminaAnnot), rownames(betas.clean)),]
dim(IlluminaAnnot)
## [1] 427140      5

look up the hits from above

IlluminaAnnot[IlluminaAnnot$Name %in% results2$FDR.sig$CPG.Labels,]
##              chr       pos Relation_to_Island UCSC_RefGene_Name       Name
## cg25294185 chr11  65487814             Island          RNASEH2C cg25294185
## cg03691818 chr12  53085038            OpenSea             KRT77 cg03691818
## cg12691488  chr1 243053673             Island                   cg12691488
## cg11643285  chr3  16411667            OpenSea             RFTN1 cg11643285
## cg25304146 chr18  30092971            OpenSea           WBP11P1 cg25304146

combining annotation and results for plotting

datamanhat <- data.frame(CpG=results2$results[,1],Chr=as.character(IlluminaAnnot$chr),
                         Mapinfo=IlluminaAnnot$pos,Pval=results2$results[,3])

Reformat the variable Chr (so we can simplify and use a numeric x-axis)

datamanhat$Chr <- as.numeric(sub("chr", "", datamanhat$Chr))

Manhattan plot

manhattan(datamanhat,"Chr","Mapinfo", "Pval", "CpG", main="Manhattan Plot - adj for CellProp")

cleanup

rm(j, nCpG, CpG.name, CpG.level, datamanhat, IlluminaAnnot, IlluminaHumanMethylation450kanno.ilmn12.hg19, lambda)

End of script 02

Regional DNA methylation analysis using DMRcate

Using data preprocessed in our script:
meth01_process_data.R & meth02_process_data.R

we have already set up our analysis

if(!exists("pheno")){
  source("code/meth02_analyze_data.R")
}

Load package for regional analysis “DMRcate” see Peters et al. Bioinformatics 2015. https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/1756-8935-8-6 Other popular options for conducting Regional DNA methylation analysis in R are Aclust and bumphunter

suppressMessages(library(DMRcate)) # Popular package for regional DNA methylation analysis

First we need to define a model

model <- model.matrix(~as.factor(pheno$Sex)+
                        as.numeric(pheno$CD8T)+
                        as.numeric(pheno$NK)+
                        as.numeric(pheno$Bcell)+
                        as.numeric(pheno$Mono)+
                        as.numeric(pheno$Gran)+
                        as.numeric(pheno$nRBC))

Let’s run the regional analysis using the Mvals from our preprocessed data

myannotation <- cpg.annotate("array", Mvals.ComBat, analysis.type="differential",
                             design=model, coef=2)
## Your contrast returned 8161 individually significant probes. We recommend the default setting of pcutoff in dmrcate().

Regions are now agglomerated from groups of significant probes where the distance to the next consecutive probe is less than lambda nucleotides away

dmrcoutput.sex <- suppressMessages(dmrcate(myannotation, lambda=1000, C=2))

Let’s look at the results

head(dmrcoutput.sex$results)
##                         coord no.cpgs minfdr      Stouffer maxbetafc
## 1404 chrX:139583634-139590479      31      0 2.403026e-171 0.5359809
## 433    chrX:48754200-48756592      27      0 2.440320e-147 0.4763541
## 667    chrX:66762467-66766506      29      0 7.825713e-132 0.4929935
## 1616 chrX:153774721-153776358      21      0 5.017214e-130 0.5240904
## 686    chrX:68046595-68050216      31      0 4.648406e-129 0.5833820
## 1454 chrX:149529976-149534258      19      0 1.413779e-121 0.4663069
##      meanbetafc
## 1404  0.3020102
## 433   0.3128542
## 667   0.1530296
## 1616  0.3460454
## 686   0.2828564
## 1454  0.3312878

Visualizing the data can help us understand where the region lies relative to promoters, CpGs islands or enhancers Let’s extract the genomic ranges and annotate to the genome

results.ranges <- extractRanges(dmrcoutput.sex, genome = "hg19")

Plot the DMR using the Gviz if you are interested in plotting genomic data the Gviz is extremely useful Let’s look at the first region

results.ranges[1]
## GRanges object with 1 range and 6 metadata columns:
##                            seqnames                 ranges strand |
##                               <Rle>              <IRanges>  <Rle> |
##   chrX:139583634-139590479     chrX [139583634, 139590479]      * |
##                              no.cpgs    minfdr      Stouffer maxbetafc
##                            <integer> <numeric>     <numeric> <numeric>
##   chrX:139583634-139590479        31         0 2.403026e-171 0.5359809
##                            meanbetafc overlapping.promoters
##                             <numeric>           <character>
##   chrX:139583634-139590479  0.3020102              SOX3-001
##   -------
##   seqinfo: 14 sequences from an unspecified genome; no seqlengths
pheno$sex<-ifelse(pheno$Sex==1,"Female","Male")
groups <- c(Female="magenta", Male="forestgreen")
cols <- groups[as.character(pheno$sex)]
DMR.plot(ranges=results.ranges, dmr=1, CpGs=betas.rcp, phen.col=cols, genome="hg19")

Most DMRs are located within sex-chromosomes Let’s look at autosomal chromosomes only We now perform the regional analysis on the data without sex-chromosomes

Mvals.clean <- log2(betas.clean)-log2(1-betas.clean)
myannotation.clean <- cpg.annotate("array", Mvals.clean, analysis.type="differential",
                                   design=model, coef=2)
## Your contrast returned 3 individually significant probes; a small but real effect. Consider manually setting the value of pcutoff to return more DMRs, but be warned that doing this increases the risk of Type I errors.
dmrcoutput.clean <- suppressMessages(dmrcate(myannotation.clean, lambda=1000, C=2))
head(dmrcoutput.clean$results)
##                    coord no.cpgs
## 2 chr6:31650735-31650850       7
##                                                 minfdr  Stouffer
## 2 0.00000000000000000000000000000000000000000000140421 0.9912497
##    maxbetafc meanbetafc
## 2 -0.4383369 -0.3389944

There’s a small bug on the extractRanges function that needs two rows for the $results output

dmrcoutput.clean$results<-rbind(dmrcoutput.clean$results,dmrcoutput.clean$results)
results.ranges <- extractRanges(dmrcoutput.clean, genome = "hg19")
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 2 --> row.names NOT used
results.ranges
## GRanges object with 2 ranges and 6 metadata columns:
##      seqnames               ranges strand |   no.cpgs
##         <Rle>            <IRanges>  <Rle> | <integer>
##    2     chr6 [31650735, 31650850]      * |         7
##   21     chr6 [31650735, 31650850]      * |         7
##                                                    minfdr  Stouffer
##                                                 <numeric> <numeric>
##    2 0.00000000000000000000000000000000000000000000140421 0.9912497
##   21 0.00000000000000000000000000000000000000000000140421 0.9912497
##       maxbetafc meanbetafc
##       <numeric>  <numeric>
##    2 -0.4383369 -0.3389944
##   21 -0.4383369 -0.3389944
##                                           overlapping.promoters
##                                                     <character>
##    2 LY6G5C-203, LY6G5C-001, LY6G5C-007, LY6G5C-002, LY6G5C-008
##   21 LY6G5C-203, LY6G5C-001, LY6G5C-007, LY6G5C-002, LY6G5C-008
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
DMR.plot(ranges=results.ranges, dmr=1, CpGs=betas.clean, phen.col=cols, genome="hg19")

End of script 03